###############################################################################   
#### Replication Materials                                                 #### 
#### Taegyoon Kim, 2022. Violent Political Rhetoric on Twitter.            ####
#### Political Science Research and Methods                                ####
###############################################################################  


###############################################################################
################################### Set Up ####################################
###############################################################################

# packages -------------------------

lapply(
  c('tidyr', 'dplyr', 'ggplot2', 'ggrepel', 'knitr', 'gridExtra', 'ggthemes',
    'readr', 'quanteda', 'tm','wordtools', 'xtable', 'scales', 'magrittr'), 
  require, 
  character.only = TRUE
  )

# load Fightin' Words function -------------------------

# Note: Define the fw functions first. They are based on <Fightin' Words: Lexical
# Feature Selection and Evaluation for Identifying the Content of Political Conflict>. 
# The original code can be found here: 
# https://burtmonroe.github.io/TextAsDataCourse/Tutorials/TADA-FightinWords.nb.html

fwgroups <- function(dtm, 
                     groups, 
                     pair = NULL, 
                     weights = rep(1, nrow(dtm)), 
                     k.prior = .1) {
  # Group-level "Fightin' Words" statistics for a document-term matrix.
  #
  # Args:
  #   dtm:     document-term matrix (documents in rows, terms in columns).
  #   groups:  group label of every document; a factor, or anything
  #            as.factor() accepts (one entry per row of dtm).
  #   pair:    optional row index (names or positions) into the group-level
  #            matrices; when given, the statistics are recomputed on those
  #            groups only and the values for the first of them are returned.
  #   weights: per-document weights (NA is treated as 0); rescaled to mean 1.
  #   k.prior: multiplier applied to the prior counts added to every cell.
  #
  # Returns:
  #   A list with elements zeta (delta / se), delta, se, counts and acounts.
  #   Without `pair` these are group-by-term matrices (counts = weighted raw
  #   counts, acounts = counts plus prior). With `pair`, zeta/delta/se are the
  #   vectors for the first group of the pair, counts holds the overall
  #   weighted term totals and acounts the column totals of the smoothed
  #   group-level counts.
  
  # droplevels() below only works on factors, so coerce once up front.
  groups <- as.factor(groups)
  
  # Fail early instead of silently recycling or misaligning documents.
  stopifnot(length(groups) == nrow(dtm), 
            length(weights) == nrow(dtm))
  
  weights[is.na(weights)] <- 0
  
  weights <- weights / mean(weights)
  
  # Drop documents that are empty or carry zero weight, then the terms that
  # no longer occur in any remaining document.
  zero.doc <- rowSums(dtm) == 0 | weights == 0
  zero.term <- colSums(dtm[!zero.doc, ]) == 0
  
  # Weighted counts: every document row is multiplied by its weight.
  dtm.nz <- apply(dtm[!zero.doc,!zero.term], 2, '*', weights[!zero.doc])
  
  # Prior: outer product of document totals and term totals, divided by the
  # grand total.
  g.prior <- tcrossprod(rowSums(dtm.nz), colSums(dtm.nz)) / sum(dtm.nz)
  
  # Smoothed counts: weighted counts plus k.prior times the prior.
  g.posterior <- as.matrix(dtm.nz + k.prior * g.prior)
  
  groups <- groups[!zero.doc]
  groups <- droplevels(groups)
  
  # Sum the smoothed counts within each group (first column of the
  # aggregate() result is the grouping variable, hence [, -1]).
  g.adtm <- as.matrix(
    aggregate(x = g.posterior, by = list(groups = groups), FUN = sum)[, -1]
  )
  rownames(g.adtm) <- levels(groups)
  
  g.ladtm <- log(g.adtm)
  
  # Log counts centred over groups (columns) and then over terms (rows).
  g.delta <- t(scale(t(scale(g.ladtm, center = TRUE, scale = FALSE)), 
                     center = TRUE, scale = FALSE))
  
  # sweep() subtracts by default, so the leading minus yields total - cell.
  g.adtm_w <- -sweep(g.adtm, 1, rowSums(g.adtm)) # terms not w spoken by k
  g.adtm_k <- -sweep(g.adtm, 2, colSums(g.adtm)) # w spoken by groups other than k
  g.adtm_kw <- sum(g.adtm) - g.adtm_w - g.adtm_k - g.adtm # total terms not w or k 
  
  g.se <- sqrt(1 / g.adtm + 1 / g.adtm_w + 1 / g.adtm_k + 1 / g.adtm_kw)
  
  g.zeta <- g.delta / g.se
  
  # Weighted raw counts per group (no prior added).
  g.counts <- as.matrix(
    aggregate(x = dtm.nz, by = list(groups = groups), FUN = sum)[, -1]
  )
  
  if (!is.null(pair)) {
    # Same computations as above, restricted to the rows selected by `pair`.
    pr.delta <- t(scale(t(scale(g.ladtm[pair,], center = TRUE, scale = FALSE)), 
                        center = TRUE, scale = FALSE))
    pr.adtm_w <- -sweep(g.adtm[pair, ], 1, rowSums(g.adtm[pair, ]))
    pr.adtm_k <- -sweep(g.adtm[pair, ], 2, colSums(g.adtm[pair, ])) # w spoken by groups other than k
    pr.adtm_kw <- sum(g.adtm[pair, ]) - pr.adtm_w - pr.adtm_k - g.adtm[pair,] # total terms not w or k
    pr.se <- sqrt(1 / g.adtm[pair, ] + 1/pr.adtm_w + 1 / pr.adtm_k + 1 / pr.adtm_kw)
    pr.zeta <- pr.delta / pr.se
    
    return(list(zeta = pr.zeta[1,], 
                delta = pr.delta[1, ], 
                se = pr.se[1, ], 
                counts = colSums(dtm.nz), 
                acounts = colSums(g.adtm)))
  }
  
  list(zeta = g.zeta,
       delta = g.delta, 
       se = g.se, 
       counts = g.counts, 
       acounts = g.adtm)
}

makeTransparent <- function(someColor, alpha = 100){
  # Convert one or more colours to hex strings that carry an alpha channel.
  #
  # Args:
  #   someColor: colour specification(s) accepted by col2rgb().
  #   alpha:     opacity on a 0-255 scale (maxColorValue is 255).
  #
  # Returns:
  #   One hex colour string per input colour.
  
  # col2rgb() yields one column per colour; rows are red, green, blue.
  channels <- col2rgb(someColor)
  
  with_alpha <- function(channel) {
    rgb(red = channel[1], 
        green = channel[2],
        blue = channel[3], 
        alpha = alpha, 
        maxColorValue = 255)
  }
  
  apply(channels, 2, with_alpha)
}

fw.ggplot.groups <- function(fw.ch, 
                             groups.use = as.factor(rownames(fw.ch$zeta)), 
                             max.words = 50, 
                             max.countrank = 400, 
                             colorpalette = rep('black',length(groups.use)), 
                             sizescale = 2, 
                             title='', 
                             subtitle = '', 
                             caption = '') {
  # Plot the top-ranked terms of each group from a fwgroups() result.
  #
  # Args:
  #   fw.ch:         list returned by fwgroups() (uses $zeta and $counts).
  #   groups.use:    groups to plot, one facet row each, in this order.
  #   max.words:     keep terms whose zeta rank within a group is <= max.words.
  #   max.countrank: keep terms whose overall count rank is <= max.countrank.
  #   colorpalette:  one colour per group, matched to groups.use in order.
  #   sizescale:     scaling factor for label and point sizes.
  #   title, subtitle, caption: passed to labs().
  #
  # Returns:
  #   The ggplot object, invisibly.
  
  # Indexing a matrix with a factor uses its integer codes, not its labels,
  # so select rows by name whenever row names are available.
  row.index <- groups.use
  if (is.factor(groups.use) && !is.null(rownames(fw.ch$zeta))) {
    row.index <- as.character(groups.use)
  }
  
  if (is.null(dim(fw.ch$zeta))) {
    # two-group fw object consists of vectors, not matrices
    zetarankmat <- cbind(rank(-fw.ch$zeta), rank(fw.ch$zeta))
    colnames(zetarankmat) <- groups.use
    countrank <- rank(-(fw.ch$counts))
  } else {
    # drop = FALSE keeps a matrix even when a single group is requested.
    zetarankmat <- apply(-fw.ch$zeta[row.index, , drop = FALSE], 1, rank)
    countrank <- rank(-colSums(fw.ch$counts))
  }
  
  # One row per term: its zeta rank in every group plus its overall count rank.
  wideplotmat <- as_tibble(cbind(zetarankmat, countrank = countrank))
  wideplotmat$term <- names(countrank)
  
  # Long format: one row per (term, group).
  rankplot <- gather(wideplotmat, groups.use, zetarank, 1:ncol(zetarankmat))
  
  # Label size shrinks with the fourth root of the zeta rank.
  rankplot$plotsize <- sizescale * (50 / (rankplot$zetarank)) ^ (1/4)
  
  keep <- rankplot$zetarank < max.words + 1 & 
    rankplot$countrank < max.countrank + 1
  rankplot <- rankplot[keep, ]
  rankplot$groups.use <- factor(rankplot$groups.use, levels = groups.use)
  
  p <- ggplot(rankplot, 
              aes(nrow(rankplot) - countrank, 
                  -zetarank, 
                  colour = groups.use)) + 
    geom_point(show.legend = FALSE, size = sizescale / 2) + 
    theme_classic() +
    ylim(-max.words, 40) +
    facet_grid(groups.use ~ .) +
    geom_text_repel(aes(label = term), 
                    max.overlaps = Inf, 
                    size = rankplot$plotsize, 
                    point.padding = .05,
                    box.padding = unit(0.20, 'lines'), 
                    show.legend = FALSE) +
    scale_colour_manual(values = alpha(colorpalette, .9)) + 
    labs(x = 'Overall Frequency', 
         y = 'Type-specific Frequency',  
         title = title, 
         subtitle = subtitle, 
         caption = caption) +
    theme(axis.ticks = element_blank(), 
          axis.text = element_blank(),
          plot.title = element_text(hjust = 0.5),
          text = element_text(size = 17.5),
          strip.text.x = element_text(size = 17.5)) 
  
  # Returned invisibly, as before, so a bare call does not draw the plot.
  invisible(p)
}

fw.keys <- function(fw.ch, n.keys = 10) {
  # Top terms per group, ranked by zeta.
  #
  # Args:
  #   fw.ch:  list with a group-by-term matrix `zeta` (row names = groups,
  #           column names = terms), e.g. the result of fwgroups().
  #   n.keys: number of terms to report for every group.
  #
  # Returns:
  #   Character matrix with n.keys rows and one column per group; each column
  #   lists the terms with the largest zeta for that group, largest first.
  #   When fewer than n.keys terms exist, the remaining entries are NA.
  n.groups <- nrow(fw.ch$zeta)
  keys <- matrix('', n.keys, n.groups)
  colnames(keys) <- rownames(fw.ch$zeta)
  
  # seq_len() (not 1:n) so a zeta matrix without rows yields an empty result
  # instead of an out-of-bounds error.
  for (g in seq_len(n.groups)) {
    ranked <- sort(fw.ch$zeta[g, ], decreasing = TRUE)
    keys[, g] <- names(ranked[seq_len(n.keys)])
  }
  keys
}


###############################################################################
############ Generate Figure 2 & Figure 4 & Table A6  & Table A10 #############
###############################################################################


# load data -------------------------

# Folders holding the replication inputs and receiving the generated outputs.
path_data <- '/kim_psrm_replication/data/' 
path_output <- '/kim_psrm_replication/output/' 

# Tweet-level data without the raw text; the column X1 is skipped on import.
df_fw_notext <- paste0(path_data, 'df_fw_notext.csv') %>% 
  read_csv(col_types = cols(X1 = col_skip()))

# Pre-built document-term matrix of the tweets.
df_fw_dfm <- readRDS(paste0(path_data, 'df_fw_dfm.rds'))

# User-level data; the column X1 is skipped on import.
df_user <- paste0(path_data, 'df_user.csv') %>% 
  read_csv(col_types = cols(X1 = col_skip()))


# text pre-processing -------------------------

# Note: Twitter restricts sharing of raw tweets, so I am only able to share an
# RDS file for the document-term matrix (i.e., df_fw_dfm.rds). The three blocks
# of code below were used to transform raw tweet text into that document-term
# matrix. The data frame, df_fw_notext, only contains the type of tweets
# (violent vs. non-violent) without the text and corresponds row by row to the
# document-term matrix (as the fwgroups() call below assumes).

# df_fw$text_processed <- iconv(df_fw$text, # remove ampersands and emojis
#                             'latin1', 
#                             'ASCII', 
#                             sub = '') 
# df_fw$text_processed <- removeWords(df_fw$text_processed, 'amp')

#  text_tokens <- tokens( # tokenization
#    df_fw$text_processed,
#    remove_punct = TRUE,
#    remove_symbols = TRUE,
#    remove_numbers = TRUE,
#    remove_url = TRUE)

#  df_dfm <- dfm( # document-term matrix
#   text_tokens,
#   tolower = TRUE,
#   stem = TRUE,
#   remove = stopwords(),
#   case_insensitive = TRUE)


# figure 2 -------------------------

# Fightin' Words statistics with the tweet type as the grouping variable.
fw <- fwgroups(dtm = df_fw_dfm, 
               groups = as.factor(df_fw_notext$type))

# One facet per tweet type, coloured in the order of the groups.
fw_plot <- fw.ggplot.groups(fw.ch = fw, 
                            max.words = 90, 
                            max.countrank = 400,
                            colorpalette = c('dodgerblue3','red'),
                            sizescale = 4)

fw_plot

# fw_plot is the most recently created plot, i.e. what ggsave() would pick up
# by default; it is passed explicitly here.
ggsave(filename = paste0(path_output, 'fig2.pdf'),
       plot = fw_plot,
       width = 12,
       height = 10,
       units = 'in',
       dpi = 600)

# table a6 -------------------------

# Thirty highest-zeta terms per tweet type: printed as a LaTeX table and
# written to CSV.
fw_keys <- fw.keys(fw.ch = fw, n.keys = 30)
fw_keys %>% xtable()
write.csv(fw_keys, file = paste0(path_output, 'tbla6.csv'))

# figure 4 -------------------------

# Shift every count by one (the histograms below use a log10 x axis) and
# label users as 'Violent' / 'Non-violent', with 'Violent' as the first level.
df_user <- df_user %>% 
  mutate(
    friends_count_one = friends_count + 1,
    followers_count_one = followers_count + 1,
    favourites_count_one = favourites_count + 1,
    statuses_count_one = statuses_count + 1,
    Type = factor(ifelse(violent == 1, 'Violent', 'Non-violent'),
                  levels = c('Violent','Non-violent'))
  )

# Overlaid density histograms (one per user type) of a count variable on a
# log10 x axis. All four panels of Figure 4 share this layout and differ only
# in the plotted column, the axis title, the axis breaks/labels and the
# legend position.
#
# Args:
#   data:       data frame with the column named in count_var and a `Type`
#               factor.
#   count_var:  name (string) of the column to plot on the x axis.
#   x_label:    x-axis title.
#   breaks:     x-axis breaks.
#   legend_pos: legend position, passed to theme(legend.position = ).
#   labels:     x-axis labels; the default keeps the scale's own labels.
#
# Returns:
#   The ggplot object.
plot_user_histogram <- function(data, 
                                count_var, 
                                x_label, 
                                breaks, 
                                legend_pos,
                                labels = waiver()) {
  ggplot(data, aes(x = .data[[count_var]])) +
    theme_bw() +
    # after_stat(density) replaces the deprecated ..density.. notation.
    geom_histogram(aes(fill = Type, y = after_stat(density)), 
                   position = 'identity', 
                   binwidth = 0.1, 
                   alpha = 0.7) +
    scale_fill_manual(values = c('red', 'skyblue')) +
    scale_x_continuous(trans = 'log10', 
                       labels = labels,
                       breaks = breaks) +
    labs(x = x_label, 
         y = 'Density\n') +
    theme(legend.text = element_text(size = 40),
          legend.title = element_text(size = 40),
          legend.position = legend_pos,
          axis.text.x = element_text(size = 40, angle = 0, vjust = 0, hjust = 0.5),
          axis.text.y = element_text(size = 40),
          axis.title = element_text(size = 40))
}

# Write one panel to path_output; the plot is passed explicitly rather than
# relying on whichever plot happens to be the last one created.
save_user_histogram <- function(panel, file_name) {
  invisible(
    ggsave(paste0(path_output, file_name),
           plot = panel,
           dpi = 600,
           width = 20,
           height = 10,
           units = 'in')
  )
}

fig4a <- plot_user_histogram( # figure 4a
  df_user, 
  count_var = 'friends_count_one',
  x_label = '\nNumber of Friends',
  breaks = c(0, 1, 10, 100, 1000, 10000, 100000),
  legend_pos = c(0.8, 0.8),
  labels = label_number(accuracy = 1))
fig4a
save_user_histogram(fig4a, 'fig4a.pdf')

fig4b <- plot_user_histogram( # figure 4b (default labels, breaks up to 10000)
  df_user, 
  count_var = 'followers_count_one',
  x_label = '\nNumber of Followers',
  breaks = c(0, 1, 10, 100, 1000, 10000),
  legend_pos = c(0.8, 0.8))
fig4b
save_user_histogram(fig4b, 'fig4b.pdf')

fig4c <- plot_user_histogram( # figure 4c
  df_user, 
  count_var = 'favourites_count_one',
  x_label = '\nNumber of Likes',
  breaks = c(0, 1, 10, 100, 1000, 10000, 100000),
  legend_pos = c(0.2, 0.8),
  labels = label_number(accuracy = 1))
fig4c
save_user_histogram(fig4c, 'fig4c.pdf')

fig4d <- plot_user_histogram( # figure 4d
  df_user, 
  count_var = 'statuses_count_one',
  x_label = '\nNumber of Tweets',
  breaks = c(0, 1, 10, 100, 1000, 10000, 100000),
  legend_pos = c(0.2, 0.8),
  labels = label_number(accuracy = 1))
fig4d
save_user_histogram(fig4d, 'fig4d.pdf')

# table a10 -------------------------

# Median of each raw count for violent (violent == 1) and non-violent
# (violent == 0) users; which() leaves out rows where `violent` is NA.
rows_violent <- which(df_user$violent == 1)
rows_nonviolent <- which(df_user$violent == 0)

# Medians of one column, violent users first.
median_by_type <- function(column) {
  values <- df_user[[column]]
  c(median(values[rows_violent]), 
    median(values[rows_nonviolent]))
}

user_median <- data.frame(
  Friends = median_by_type('friends_count'),
  Followers = median_by_type('followers_count'),
  Likes = median_by_type('favourites_count'),
  Tweets = median_by_type('statuses_count')
)
row.names(user_median) <- c('Violent', 'Non-violent')

write.csv(user_median, file = paste0(path_output, 'tbla10.csv'))